#First pass at audit Validation graphs
library(readr)
library(haven)
library(tidyverse)
library(magrittr)

#Define functions for later use
add_narm <- function(a,b,c,d,e,f){
  a = ifelse(is.na(a)==TRUE,0,a)
  b = ifelse(is.na(b)==TRUE,0,b)
  c = ifelse(is.na(c)==TRUE,0,c)
  d = ifelse(is.na(d)==TRUE,0,d)
  e = ifelse(is.na(e)==TRUE,0,e)
  f = ifelse(is.na(f)==TRUE,0,f)
  a+b+c+d+e+f
}

#function to remove quotations from a name
quotes_remove <- function(x){
  name <- str_split(x,"'")
  name_fixed <- rep(NA,length(x))
  for(i in 1:length(x)){
    name_fixed[i] <- name[[i]][2]
  }
  name_fixed <- str_trim(name_fixed,side="right")
}

sum_narm <- function(x){
  sum(x,na.rm = TRUE)
}

mean_narm <- function(x){
  mean(x,na.rm = TRUE)
}

#Load the data
reg_data <- read.csv("Z:/Race_Prediction/Data/NTJ Short Paper/reg_data.csv")
load("X:/public/demographic_tax_data/LIHTC/Data/CI_boot_samples/sp_summaries/all.RData")
CI_data <- as.data.frame(diffs.audit.incbin.clust)
CI_data$age_bin <- as.integer(CI_data$agi.bin.filers)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# Second stage Audit rate validation graphs

audit <- subset(reg_data,reg_data$filer==1)
audit$audit_ind <- ifelse(audit$audit==TRUE,1,0)

#Define graph data
bisg_audit_ind <- c(mean(subset(audit$audit_ind_ind,audit$demographic==1)),
                   sum(audit$audit_ind_ind*audit$prob_b1)/sum(audit$prob_b1),
                   mean(subset(audit$audit_ind_ind,audit$demographic==2)),
                   sum(audit$audit_ind_ind*audit$prob_b2)/sum(audit$prob_b2),
                   mean(subset(audit$audit_ind_ind,audit$demographic==3)),
                   sum(audit$audit_ind_ind*audit$prob_b3)/sum(audit$prob_b3),
                   mean(subset(audit$audit_ind_ind,audit$demographic==4)),
                   sum(audit$audit_ind_ind*audit$prob_b4)/sum(audit$prob_b4),
                   mean(subset(audit$audit_ind_ind,audit$demographic==6)),
                   sum(audit$audit_ind_ind*audit$prob_b6)/sum(audit$prob_b6))

bisg_audit_ind <- bisg_audit_ind*100
round(bisg_audit_ind,2)

label1 <- c("White","Black","ANHPI","Native","Hispanic")
label2 <- c("","","","","") 

#Graph Audit differences truth versus predicted
par(mfrow=c(1,1))

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

#Plots with error bars
library(gplots)

bisg_m <- matrix(data = bisg_audit_ind, nrow=2, ncol=5)

CI_main <- as.data.frame(sum.audit.clust)
CI_main <- subset(CI_main,CI_main$boot_draw_i!=5)
upper <- matrix(data = c(NA,CI_main$CI_high[1],NA,CI_main$CI_high[2],NA,CI_main$CI_high[3],NA,CI_main$CI_high[4],NA,CI_main$CI_high[5])*100, nrow=2, ncol=5)
lower <- matrix(data = c(NA,CI_main$CI_low[1],NA,CI_main$CI_low[2],NA,CI_main$CI_low[3],NA,CI_main$CI_low[4],NA,CI_main$CI_low[5])*100, nrow=2, ncol=5)

names <- c("White","Black","ANHPI","Native","Hispanic")

par(bg=NA)
bp <- barplot2(bisg_m, beside = TRUE, horiz = FALSE, 
               plot.ci = TRUE, ci.u = upper, ci.l = lower, ci.lwd=4, border="white",ci.color = "orange",
               names.arg = names, col = c("lightskyblue", "firebrick"), xlim = c(1, 15),yaxt="n",ylab="Percent",
               ylim=c(0,2),cex.names = 1.2)
abline(h=0,col="black")
abline(h=0,col="black")
axis(side=2, at=c(0,0.5,1,1.5,2),cex=1.1,
     labels=c("0%","0.5%","1.0%","1.5%","2.0%"))
legend("topleft", c("Observed","BIFSG"), pch=15, 
       col=c("lightskyblue","firebrick"),
       bty="n",cex=1.1)


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Differences by AGI category: Non-White - White
# Audit analysis restricted to filers 

audit <- subset(reg_data,reg_data$filer==1)
audit$audit_ind <- ifelse(audit$audit==TRUE,1,0)

#Define AGI categories
cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(audit$agi,(0.1*i))
}

audit %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))


#Difference in conditional means between black and white
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="2_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="2_1")

xvals <- seq(0.1,1,0.1)

#Graph audit Take-up truth versus predicted
par(mfrow=c(2,2))
par(bg=NA)

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.02,0.06),col="white",
     main="Black - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=3)
lines(xvals,predicted,col= "firebrick",lwd=3)
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1,
       bty="n")




#Difference in conditional means between hispanic and white
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b6,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="6_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="6_1")

xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.02,0.06),col="white",
     main="Hispanic - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=3)
lines(xvals,predicted,col= "firebrick",lwd=3)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1,
       bty="n")



#Difference in conditional means between ANHPI and white
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b3,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="3_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="3_1")

xvals <- seq(0.1,1,0.1)



# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-0.02,0.06),
     main="ANHPI - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.3)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=3)
lines(xvals,predicted,col= "firebrick",lwd=3)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1,
       bty="n")



#Difference in conditional means between Native and white
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

xvals <- seq(0.1,1,0.1)

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="4_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="4_1")

# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-0.02,0.06),
     main="Native - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=3)
lines(xvals,predicted,col= "firebrick",lwd=3)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1,
       bty="n")



#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Differences by AGI category: Non-White compared to Non-White
audit <- subset(reg_data,reg_data$filer==1)
audit$audit_ind <- ifelse(audit$audit==TRUE,1,0)

#Define AGI categories

cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(audit$agi,(0.1*i))
}

audit %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))


#++++++++++++ Graph other differences against each other
#Difference in conditional means between Black and Hispanic
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b6,audit$agi_cat==i)))
}

CI_low <- -1*subset(CI_data$CI_lower,CI_data$diff=="6_2")
CI_high <- -1*subset(CI_data$CI_upper,CI_data$diff=="6_2")

xvals <- seq(0.1,1,0.1)

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

#Graph audit Take-up truth versus predicted
par(mfrow=c(3,2))
par(bg=NA)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.035,0.065),col="white",
     main="Black - Hispanic",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")



#Difference in conditional means between Black and Asian
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

CI_low <- -1*subset(CI_data$CI_lower,CI_data$diff=="3_2")
CI_high <- -1*subset(CI_data$CI_upper,CI_data$diff=="3_2")

xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.035,0.065),col="white",
     main="Black - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")



#Difference in conditional means between Hispanic and Asian
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b6,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="6_3")
CI_high <-subset(CI_data$CI_upper,CI_data$diff=="6_3")

xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.035,0.065),col="white",
     main="Hispanic - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")




#Difference in conditional means between Native and Black
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b2,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="4_2")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="4_2")

xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.04,0.06),col="white",
     main="Native - Black",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")



#Difference in conditional means between Native and Asian
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="4_3")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="4_3")

xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.035,0.065),col="white",
     main="Native - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")



#Difference in conditional means between Native and Hispanic
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
                        mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b6,audit$agi_cat==i)))
}

CI_low <- -1*subset(CI_data$CI_lower,CI_data$diff=="6_4")
CI_high <- -1*subset(CI_data$CI_upper,CI_data$diff=="6_4")

xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.035,0.065),col="white",
     main="Native - Hispanic",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","BIFSG","Bootstrapped CIs"), lwd=c(4,4,10), 
       col=c(clr[2],"firebrick","mistyrose"),cex=1.1,
       bty="n")





#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#+#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Bounding Exercise (Elzayn, et al.): Non-White - White
# Audit analysis restricted to filers 

audit <- subset(reg_data,reg_data$filer==1)
audit$audit_ind <- ifelse(audit$audit==TRUE,1,0)

#Define AGI categories
cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(audit$agi,(0.1*i))
}

audit %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))

#Difference in conditional means between black and white

#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="2_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="2_1")


#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[2]

}  
  
#X-values for graph
xvals <- seq(0.1,1,0.1)

#Graph audit Take-up truth versus predicted
par(mfrow=c(2,2))
par(bg=NA)

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.02,0.06),col="white",
     main="Black - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")




#Difference in conditional means between hispanic and white

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b6,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="6_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="6_1")


#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[6]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.02,0.06),col="white",
     main="Hispanic - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between ANHPI and white

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b3,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="3_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="3_1")

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-0.02,0.06),
     main="ANHPI - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.3)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between Native and white

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==1 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b1,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b1,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b2 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[4]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

CI_low <- subset(CI_data$CI_lower,CI_data$diff=="4_1")
CI_high <- subset(CI_data$CI_upper,CI_data$diff=="4_1")

# Draw the two bar chart using above datasets
plot(xvals,true_val,col="white", ylim=c(-0.02,0.06),
     main="Native - White",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.1, cex.main=1.3)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")






#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Bounding Exercise (Elzayn, et al.): Non-White - Non-White

audit <- subset(reg_data,reg_data$filer==1)
audit$audit_ind <- ifelse(audit$audit==TRUE,1,0)

#Define AGI categories

cutoffs <- rep(0,10)
for(i in 1:10){
  cutoffs[i] <- quantile(audit$agi,(0.1*i))
}

audit %<>%
  mutate(agi_cat = ifelse(agi<cutoffs[1],1,NA),
         agi_cat = ifelse(agi>=cutoffs[1] & agi<cutoffs[2],2,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[2] & agi<cutoffs[3],3,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[3] & agi<cutoffs[4],4,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[4] & agi<cutoffs[5],5,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[5] & agi<cutoffs[6],6,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[6] & agi<cutoffs[7],7,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[7] & agi<cutoffs[8],8,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[8] & agi<cutoffs[9],9,agi_cat),
         agi_cat = ifelse(agi>=cutoffs[9],10,agi_cat))


#++++++++++++ Graph other differences against each other
#Difference in conditional means between Black and Hispanic

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b6,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b2+ prob_b3 + prob_b4 + prob_b5, data = audit_sub)
  predict_linear[i] <- regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

#Graphing colors
library(RColorBrewer)
clr <- brewer.pal(8,"Set1")

#Graph audit Take-up truth versus predicted
par(mfrow=c(3,2))
par(bg=NA)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.015,0.045),col="white",
     main="Black - Hispanic",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between Black and Asian

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b2,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b2 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[3]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.015,0.045),col="white",
     main="Black - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between Hispanic and Asian

#Probabilistic estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b6,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b2 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[6]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.015,0.045),col="white",
     main="Hispanic - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")




#Difference in conditional means between Native and Black

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==2 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b2,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b2,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b3 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[4]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)

# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.045,0.025),col="white",
     main="Native - Black",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.04,-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-4","-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")



#Difference in conditional means between Native and Asian

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==3 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b3,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b3,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b2 + prob_b4 + prob_b5 + prob_b6, data = audit_sub)
  predict_linear[i] <- regression$coefficients[4]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.015,0.045),col="white",
     main="Native - ANHPI",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")


#Difference in conditional means between Native and Hispanic

#Probabilistic Estimator
true_val <- rep(0,10)
for(i in 1:10){
  true_val[i] <- mean(subset(audit$audit_ind,audit$demographic==4 & audit$agi_cat==i)) -
    mean(subset(audit$audit_ind,audit$demographic==6 & audit$agi_cat==i))
}

predicted <- rep(0,10)
for(i in 1:10){
  predicted[i] <- sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b4,audit$agi_cat==i)/
                             sum_narm(subset(audit$prob_b4,audit$agi_cat==i))) -
    sum_narm(subset(audit$audit_ind,audit$agi_cat==i)*subset(audit$prob_b6,audit$agi_cat==i)/
               sum_narm(subset(audit$prob_b6,audit$agi_cat==i)))
}

#Linear estimator
audit2 <- as.data.frame(audit)

predict_linear <- rep(0,10)
for(i in 1:10){
  audit_sub = subset(audit2,audit2$agi_cat==i)
  regression = lm(audit_ind ~ prob_b1 + prob_b2 + prob_b4 + prob_b5 + prob_b3, data = audit_sub)
  predict_linear[i] <- regression$coefficients[4]
  
}  

#X-values for graph
xvals <- seq(0.1,1,0.1)


# Draw the two bar chart using above datasets
plot(xvals,true_val,ylim=c(-0.015,0.045),col="white",
     main="Native - Hispanic",
     xlab = "Sample Income Decile",
     ylab="Percentage Point Difference",
     yaxt="n",xaxt="n",
     cex.lab=1.3, cex.main=1.5)
# polygon(c(xvals,rev(xvals)),c(CI_low,rev(CI_high)),col="mistyrose",border=NA)
abline(h=0,col="gray",lty=2)
lines(xvals,true_val,col=clr[2],lwd=4)
lines(xvals,predicted,col= "firebrick",lwd=4,lty=4)
lines(xvals,predict_linear,col="black",lwd=4,lty=4)
axis(side=1, at=c(0.2,0.4,0.6,0.8,1),
     labels=c("20%","40%","60%","80%","100%"))
axis(side=2, at=c(-0.03,-0.02,-0.01,0,0.01,0.02,0.03,0.04,0.05,0.06),
     labels=c("-3","-2","-1","0","1","2","3","4","5","6"))
legend("topleft", c("Observed","Probabilistic Est.","Linear Est."), lwd=c(4,4,4),
       lty=c(1,4,4),
       col=c(clr[2],"firebrick","black"),cex=1,
       bty="n")





